This Rmarkdown file assesses the output of CheckV, DeepVirFinder, Kaiju, VIBRANT, VirSorter, and VirSorter2 on multiple training sets of microbial DNA, primarily from NCBI. Created from fungal, viral, bacterial, archeael, protist, and plasmid DNA sequences
Please reach out to James Riddell (riddell.26@buckeyemail.osu.edu) or Bridget Hegarty (beh53@case.edu) regarding any issues, or open an issue on github.
library(ggplot2)
There were 43 warnings (use warnings() to see them)
library(plyr)
library(reshape2)
library(viridis)
library(tidyr)
library(dplyr)
library(readr)
library(data.table)
library(pROC)
Import the file that combines the results from each of the tools from running “combining_tool_output.Rmd”:
viruses <- read_tsv("../IntermediaryFiles/viral_tools_combined.tsv")
── Column specification ──────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
seqtype = col_character(),
contig = col_character(),
checkv_provirus = col_character(),
checkv_quality = col_character(),
method.x = col_character(),
Classified = col_character(),
IDs_all = col_character(),
Seq = col_character(),
Kaiju_Viral = col_character(),
Kingdom = col_character(),
type = col_character(),
vibrant_quality = col_character(),
method.y = col_character(),
vibrant_prophage = col_character(),
vs2type = col_character(),
max_score_group = col_character(),
provirus = col_logical()
)
ℹ Use `spec()` for the full column specifications.
This section defines a viralness score “keep_score” based on the tool classifications. A final keep_score above 1 indicates we will keep that sequence and call it viral.
VIBRANT Quality == “Complete Circular”: +1 Quality == “High Quality Draft”: +1 Quality == “Medium Quality Draft”: +1 Quality == “Low Quality Draft” & provirus: +0.5
Virsorter2 Viral >= 50: +0.5 Viral >= 0.95: +0.5 RNA >= 0.9: +1 lavidaviridae >= 0.9: +1 NCLDV >= 0.9: +1
Virsorter category == 1,4: +1 category == 2,5: +0.5
DeepVirFinder: Score >= 0.7: +0.5
Tuning - No Viral Signature: Kaiju_viral = “cellular organisms”: -0.5 If host_genes >50 and NOT provirus: -1 If viral_genes == 0 and host_genes >= 1: -1 If 3*viral_genes <= host_genes and NOT provirus: -1 If length > 50,000 and hallmark <=1: -1 If length < 5000 and checkv completeness <= 75: -0.5
Tuning - Viral Signature: Kaiju_viral = “Viruses”: +1 If %unknown >= 75 and length < 50000: +0.5 If %viral >= 50: +0.5 Hallmark > 2: +1
This script produces visualizations of these combined viral scorings and includes ecological metrics like alpha diversity.
You can decide which combination is appropriate for them and only need use the tools appropriate for your data.
getting_viral_set_1 <- function(input_seqs,
include_vibrant=FALSE,
include_virsorter2=FALSE,
include_deepvirfinder=FALSE,
include_tuning_viral=FALSE,
include_tuning_not_viral=FALSE,
include_virsorter=FALSE) {
keep_score <- rep(0, nrow(input_seqs))
if (include_vibrant) {
keep_score[input_seqs$vibrant_quality=="complete circular"] <- keep_score[input_seqs$vibrant_quality=="complete circular"] + 1
keep_score[input_seqs$vibrant_quality=="high quality draft"] <- keep_score[input_seqs$vibrant_quality=="high quality draft"] + 1
keep_score[input_seqs$vibrant_quality=="medium quality draft"] <- keep_score[input_seqs$vibrant_quality=="medium quality draft"] + 1
keep_score[input_seqs$vibrant_quality=="low quality draft" & input_seqs$provirus] <- keep_score[input_seqs$vibrant_quality=="low quality draft" & input_seqs$provirus] + 0.5
}
if (include_virsorter2) {
keep_score[input_seqs$viral>=50] <- keep_score[input_seqs$viral>=50] + 0.5
keep_score[input_seqs$viral>=95] <- keep_score[input_seqs$viral>=95] + 0.5
keep_score[input_seqs$RNA>=0.9] <- keep_score[input_seqs$RNA>=0.9] + 1
keep_score[input_seqs$lavidaviridae>=0.9] <- keep_score[input_seqs$lavidaviridae>=0.9] + 1
keep_score[input_seqs$NCLDV>=0.9] <- keep_score[input_seqs$NCLDV>=0.9] + 1
}
if (include_virsorter) {
keep_score[input_seqs$category==1] <- keep_score[input_seqs$category==1] + 1
keep_score[input_seqs$category==2] <- keep_score[input_seqs$category==2] + 0.5
keep_score[input_seqs$category==4] <- keep_score[input_seqs$category==4] + 1
keep_score[input_seqs$category==5] <- keep_score[input_seqs$category==5] + 0.5
}
if (include_deepvirfinder) {
keep_score[input_seqs$score>=0.7 & input_seqs$checkv_length<20000] <- keep_score[input_seqs$score>=0.7 & input_seqs$checkv_length<20000] + 0.5
keep_score[input_seqs$score>=0.9 & input_seqs$checkv_length<20000] <- keep_score[input_seqs$score>=0.9 & input_seqs$checkv_length<20000] + 0.5
}
if (include_tuning_viral) {
keep_score[input_seqs$Kaiju_Viral=="Viruses"] <- keep_score[input_seqs$Kaiju_Viral=="Viruses"] + 0.5
keep_score[input_seqs$hallmark>2] <- keep_score[input_seqs$hallmark>2] + 1
keep_score[input_seqs$percent_unknown>=75 & input_seqs$checkv_length<50000] <- keep_score[input_seqs$percent_unknown>=75 & input_seqs$checkv_length<50000] + 0.5
keep_score[input_seqs$percent_viral>=50] <- keep_score[input_seqs$percent_viral>=50] + 0.5
}
if (include_tuning_not_viral) {
keep_score[input_seqs$Kaiju_Viral=="cellular organisms"] <- keep_score[input_seqs$Kaiju_Viral=="cellular organisms"] - 0.5
keep_score[input_seqs$checkv_host_genes>50 & !input_seqs$provirus] <- keep_score[input_seqs$checkv_host_genes>50 & !input_seqs$provirus] - 1
keep_score[input_seqs$checkv_viral_genes==0 & input_seqs$checkv_host_genes>=1] <- keep_score[input_seqs$checkv_viral_genes==0 & input_seqs$checkv_host_genes>=1] - 1
keep_score[((input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes) & !input_seqs$provirus] <- keep_score[((input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes) & !input_seqs$provirus] - 1 # consider accounting for RNA viruses
keep_score[input_seqs$checkv_length>500000 & input_seqs$hallmark<=1] <- keep_score[input_seqs$checkv_length>500000 & input_seqs$hallmark<=1] - 1
keep_score[input_seqs$checkv_completeness<=75 & input_seqs$checkv_length<=5000] <- keep_score[input_seqs$checkv_completeness<=75 & input_seqs$checkv_length<=5000] - 0.5 # helped with protist contamination
}
return(keep_score)
}
note that this is only as accurate as the annotations of the input sequences
this function calculates the precision, recall, and F1 score for each pipeline
assess_performance <- function(seqtype, keep_score) {
truepositive <- rep("not viral", length(seqtype))
truepositive[seqtype=="virus"] <- "viral"
#make confusion matrix
confusion_matrix <- rep("true negative", length(keep_score))
confusion_matrix[truepositive=="viral" & keep_score<=1] <- "false negative"
confusion_matrix[truepositive=="viral" & keep_score>=1] <- "true positive"
confusion_matrix[truepositive=="not viral" & keep_score>=1] <- "false positive"
TP <- table(confusion_matrix)[4]
FP <- table(confusion_matrix)[2]
TN <- table(confusion_matrix)[3]
FN <- table(confusion_matrix)[1]
precision <- TP/(TP+FP)
recall <- TP/(TP+FN)
F1 <- 2*precision*recall/(precision+recall)
MCC <- (TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN))
auc <- round(auc(truepositive, keep_score),4)
#by type metrics
fungal_FP <- table(confusion_matrix[seqtype=="fungi"])[2]
protist_FP <- table(confusion_matrix[seqtype=="protist"])[2]
bacterial_FP <- table(confusion_matrix[seqtype=="bacteria"])[2]
viral_FN <- table(confusion_matrix[seqtype=="virus"])[1]
performance <- c(precision, recall, F1, MCC, auc, fungal_FP,
protist_FP, bacterial_FP, viral_FN)
names(performance) <- c("precision", "recall", "F1", "MCC", "AUC", "fungal_FP",
"protist_FP", "bacterial_FP", "viral_FN")
return(performance)
}
combination of tools list
combos_list <- data.frame(toolcombo=rep(0, 64),
tune_not_viral=rep(0, 64),
DVF=rep(0, 64),
tune_viral=rep(0, 64),
VIBRANT=rep(0, 64),
VS=rep(0, 64),
VS2=rep(0, 64))
p <- 1
for (i in c(0,1)){
for (j in c(0,1)){
for (k in c(0,1)){
for (l in c(0,1)){
for (m in c(0,1)){
for (n in c(0,1)){
combos_list$toolcombo[p] <- paste(i,j,k,l,m,n)
combos_list$toolcombo2[p] <- paste(if(i){"tv"}else{"0"},if(j){"DVF"}else{"0"},
if(k){"tnv"}else{"0"},if(l){"VB"}else{"0"},
if(m){"VS"}else{"0"},if(n){"VS2"}else{"0"})
combos_list$tune_not_viral[p] <- i
combos_list$DVF[p] <- j
combos_list$tune_viral[p] <- k
combos_list$VIBRANT[p] <- l
combos_list$VS[p] <- m
combos_list$VS2[p] <- n
p <- p+1
}
}
}
}
}
}
combos_list <- combos_list[-1,]
this function builds a list of all of the combinations that the user wants to test. In this case, we’re comparing the performance of all unique combinations of the six tools.
build_score_list <- function(input_seqs, combos) {
output <- data.frame(precision=rep(0, nrow(combos)),
recall=rep(0, nrow(combos)),
F1=rep(0, nrow(combos)),
MCC=rep(0, nrow(combos)),
AUC=rep(0, nrow(combos)),
fungal_FP=rep(0, nrow(combos)),
protist_FP=rep(0, nrow(combos)),
bacterial_FP=rep(0, nrow(combos)),
viral_FN=rep(0, nrow(combos)))
for (i in 1:nrow(combos)) {
keep_score <- getting_viral_set_1(input_seqs, include_vibrant = combos$VIBRANT[i],
include_virsorter = combos$VS[i],
include_virsorter2 = combos$VS2[i],
include_tuning_viral = combos$tune_viral[i],
include_tuning_not_viral = combos$tune_not_viral[i],
include_deepvirfinder = combos$DVF[i])
output[i,1:9] <- assess_performance(input_seqs$seqtype, keep_score)
output$toolcombo[i] <- paste(combos$tune_viral[i],combos$DVF[i],
combos$tune_not_viral[i], combos$VIBRANT[i],
combos$VS[i], combos$VS2[i])
}
output[is.na(output)] <- 0
return (output)
}
accuracy_scores <- data.frame(testing_set_index=rep(0, nrow(combos_list)*10),
precision=rep(0, nrow(combos_list)*10),
recall=rep(0, nrow(combos_list)*10),
F1=rep(0, nrow(combos_list)*10),
MCC=rep(0, nrow(combos_list)*10),
AUC=rep(0, nrow(combos_list)*10),
fungal_FP=rep(0, nrow(combos_list)*10),
protist_FP=rep(0, nrow(combos_list)*10),
bacterial_FP=rep(0, nrow(combos_list)*10),
viral_FN=rep(0, nrow(combos_list)*10))
accuracy_scores <- cbind(testing_set_index=rep(1, nrow(combos_list)),
build_score_list(viruses[viruses$Index==1,], combos_list))
for (i in 2:10) {
accuracy_scores <- rbind(accuracy_scores,
cbind(testing_set_index=rep(i, nrow(combos_list)),
build_score_list(viruses[viruses$Index==i,], combos_list)))
}
library("stringr")
accuracy_scores$numtools <- str_count(accuracy_scores$toolcombo, "1")
#accuracy_scores <- accuracy_scores[order(accuracy_scores$numtools, decreasing=F),]
accuracy_scores <- accuracy_scores[order(accuracy_scores$MCC, decreasing=F),]
accuracy_scores$toolcombo <- factor(accuracy_scores$toolcombo, levels = unique(accuracy_scores$toolcombo))
accuracy_scores$numtools <- as.factor(accuracy_scores$numtools)
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
p2 <- ggplot(accuracy_scores, aes(x=toolcombo, y=F1,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("F1 Score")
p2
ggplot(accuracy_scores, aes(x=toolcombo, y=precision,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Precision")
ggplot(accuracy_scores, aes(x=toolcombo, y=recall,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Recall")
ggplot(accuracy_scores, aes(x=precision, y=recall,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Precision") +
ylab("Recall")
ggplot(accuracy_scores, aes(x=toolcombo, y=abs(precision-recall),
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Precision-Recall")
ggplot(accuracy_scores, aes(x=toolcombo, y=MCC,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("MCC")
ggplot(accuracy_scores, aes(x=toolcombo, y=AUC,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("AUC")
ggplot(accuracy_scores, aes(x=toolcombo, y=fungal_FP,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Fungal False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=protist_FP,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Protist False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=bacterial_FP,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Bacterial False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=viral_FN,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Viral False Negatives")
write_tsv(accuracy_scores, "20221029_accuracy_scores.tsv")
to do: add in clustering and ordination like in the drinking water R notebook
viruses$keep_score_high_precision <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = T,
include_virsorter = F)
viruses$confusion_matrix_high_precision <- "true negative"
viruses$confusion_matrix_high_precision[viruses$seqtype=="virus" & viruses$keep_score_high_precision<1] <- "false negative"
viruses$confusion_matrix_high_precision[viruses$seqtype=="virus" & viruses$keep_score_high_precision>=1] <- "true positive"
viruses$confusion_matrix_high_precision[viruses$seqtype!="virus" & viruses$keep_score_high_precision>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index))
The melt generic in data.table has been passed a table and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index)). In the next version, this warning will become an error.
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
length(grep("true", viruses$confusion_matrix_high_precision))/nrow(viruses)
[1] 0.9206364
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
this rule set had the highest precision, but as you can see, this comes with a big sacrifice in recall
viruses$keep_score_high_MCC <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses$confusion_matrix_high_MCC <- "true negative"
viruses$confusion_matrix_high_MCC[viruses$seqtype=="virus" & viruses$keep_score_high_MCC<1] <- "false negative"
viruses$confusion_matrix_high_MCC[viruses$seqtype=="virus" & viruses$keep_score_high_MCC>=1] <- "true positive"
viruses$confusion_matrix_high_MCC[viruses$seqtype!="virus" & viruses$keep_score_high_MCC>=1] <- "false positive"
accuracy:
length(grep("true", viruses$confusion_matrix_high_MCC))/nrow(viruses)
[1] 0.9422999
recall
length(grep("true positive", viruses$confusion_matrix_high_MCC))/length(grep("virus", viruses$seqtype))
[1] 0.9039
viruses$size_class <- "3-10kb"
viruses$size_class[viruses$checkv_length>10000] <- ">10kb"
visualizing confusion matrix by taxa
confusion_by_taxa <- viruses %>% count(confusion_matrix_high_MCC, seqtype, Index)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
percent viral
visualizing confusion matrix by taxa
confusion_by_taxa <- viruses %>% count(confusion_matrix_high_MCC, seqtype, size_class, Index)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","size", "index", "count")
confusion_vir_called <- confusion_by_taxa %>% filter(confusion_matrix=="true positive" | confusion_matrix=="false positive")
type_count <- viruses %>% count(seqtype, size_class, Index)
confusion_vir_called$per_viral <- 0
for (i in c(1:nrow(confusion_vir_called))) {
confusion_vir_called$per_viral[i] <- confusion_vir_called$count[i]/type_count$n[type_count$seqtype==confusion_vir_called$seqtype[i] &
type_count$Index==confusion_vir_called$index[i] &
type_count$size_class==confusion_vir_called$size[i]]*100
}
confusion_vir_called <- confusion_vir_called %>% group_by(seqtype, size) %>%
summarise(mean=mean(per_viral),
sd=sd(per_viral))
`summarise()` has grouped output by 'seqtype'. You can override using the `.groups` argument.
ggplot(confusion_vir_called, aes(y=mean, x=size,
fill=seqtype,
color=seqtype)) +
geom_bar(stat="identity", position=position_dodge()) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=.2,
position=position_dodge(.9)) +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
xlab("Length (bp)") +
ylab("Sequences Called Viral (%)")
viruses$keep_score_vb <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf_vs2 <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf_vs2_vs <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$keep_score_vb_dvf_vs2_vs_tv <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$keep_score_vb_dvf_vs2_vs_tv_tnv <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses_high <- viruses[viruses$keep_score_vb_dvf_vs2_vs_tv>=1,]
viruses_high_mod <- viruses_high %>% select(keep_score_vb,keep_score_vb_dvf,
keep_score_vb_dvf_vs2, keep_score_vb_dvf_vs2_vs,
keep_score_vb_dvf_vs2_vs_tv, keep_score_vb_dvf_vs2_vs_tv_tnv)
#viruses_high_mod <- apply(viruses_high_mod, c(1,2), function(x) {if (x >= 1) {x <- 1} else {x <- 0}})
viruses_high_mod <- as_tibble(viruses_high_mod)
sm_m <- reshape2::melt(viruses_high_mod)
No id variables; using all as measure variables
colnames(sm_m) <- c("method", "viral_score")
sm_m <- sm_m[sm_m$viral_score>0,]
sm_m$score <- sm_m$viral_score
sm_m$score[sm_m$viral_score==0.5] <- "0.5"
sm_m$score[sm_m$viral_score>=1] <- "1"
sm_m$score[sm_m$viral_score>=2] <- "2"
sm_m$score[sm_m$viral_score>=3] <- "3"
sm_m$score[sm_m$viral_score>=4] <- "4"
sm_m$score[sm_m$viral_score>=5] <- "5"
sm_m$score <- factor(sm_m$score,
levels=c("0.5", "1", "2","3","4","5"))
ggplot(sm_m, aes(x=method, y=score,
fill=score)) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "bottom"
) +
scale_fill_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 1)) +
xlab("") +
ylab("Number of Sequences") +
coord_flip()
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
viruses$keep_score_visualize <- viruses$keep_score_high_MCC
viruses$keep_score_visualize[viruses$keep_score_high_MCC>1] <- "> 1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==1] <- "1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==0.5] <- "0.5"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==0] <- "0"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==-0.5] <- "-0.5"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==-1] <- "-1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC<=-1] <- "< -1"
viruses$keep_score_visualize <- factor(viruses$keep_score_visualize,
levels=c("< -1", "-1", "-0.5", "0", "0.5","1", "> 1"))
#viruses$keep_score_visualize <- factor(viruses$keep_score_visualize,
# labels=c("≤ 0", "≤ 0", "≤ 0", "0.5","1", "> 1"))
levels(factor(viruses$keep_score_visualize))
[1] "< -1" "-0.5" "0" "0.5" "1" "> 1"
ggplot(viruses, aes(x=as.factor(Index),
fill=keep_score_visualize, color=keep_score_visualize)) +
geom_bar(stat="count", position="stack") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16)
) +
scale_color_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 1)) +
scale_fill_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 0.5)) +
xlab("Index") +
ylab("Sequence Count") +
facet_wrap(~confusion_matrix_high_MCC, scales = "free")
viruses$keep_score_all <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses$confusion_matrix_all <- "true negative"
viruses$confusion_matrix_all[viruses$seqtype=="virus" & viruses$keep_score_all<1] <- "false negative"
viruses$confusion_matrix_all[viruses$seqtype=="virus" & viruses$keep_score_all>=1] <- "true positive"
viruses$confusion_matrix_all[viruses$seqtype!="virus" & viruses$keep_score_all>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_all, viruses$seqtype, viruses$Index))
The melt generic in data.table has been passed a table and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(table(viruses$confusion_matrix_all, viruses$seqtype, viruses$Index)). In the next version, this warning will become an error.
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
table(viruses$confusion_matrix_all)
false negative false positive true negative true positive
786 5508 80783 9214
length(grep("true", viruses$confusion_matrix_all))/nrow(viruses)
[1] 0.9346356
length(grep("true positive", viruses$confusion_matrix_all))/length(grep("virus", viruses$seqtype))
[1] 0.9214
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
viruses$keep_score_high_recall <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$confusion_matrix_high_recall <- "true negative"
viruses$confusion_matrix_high_recall[viruses$seqtype=="virus" & viruses$keep_score_high_recall<1] <- "false negative"
viruses$confusion_matrix_high_recall[viruses$seqtype=="virus" & viruses$keep_score_high_recall>=1] <- "true positive"
viruses$confusion_matrix_high_recall[viruses$seqtype!="virus" & viruses$keep_score_high_recall>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_recall, viruses$seqtype, viruses$Index))
The melt generic in data.table has been passed a table and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(table(viruses$confusion_matrix_high_recall, viruses$seqtype, viruses$Index)). In the next version, this warning will become an error.
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
p2 <- ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
p2
accuracy:
length(grep("true", viruses$confusion_matrix_high_recall))/nrow(viruses)
[1] 0.8834574
0.887
recall
length(grep("true positive", viruses$confusion_matrix_high_recall))/length(grep("virus", viruses$seqtype))
[1] 0.9604
recover almost all of the viruses this way, but more protist contamination
0.960
Extra Stuff #####################################################################
ggplot(viruses, aes(x=checkv_length, y=keep_score_high_MCC,
fill=confusion_matrix_high_MCC,
color=confusion_matrix_high_MCC)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Sequence Length (bp)") +
ylab("Pipeline Viral Score") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=checkv_completeness, y=hallmark,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("CheckV Completeness") +
ylab("Number of Hallmark Genes") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=checkv_completeness, y=keep_score_high_MCC,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("CheckV Completeness") +
ylab("Pipeline Viral Score") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=confusion_matrix_high_recall, y=checkv_length,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_boxplot() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Sequence Length (bp)") +
ylab("Pipeline Viral Score") +
scale_y_log10()
looking at false negatives
viruses_false_negs <- viruses[(viruses$seqtype=="virus" & viruses$keep_score_high_recall<1),]
looking at protists calling viral
viruses_false_pos_protists <- viruses[(viruses$seqtype=="protist" & viruses$keep_score_high_recall>=1),]
viruses$keep_score_vb <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_tv <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_high <- viruses[viruses$keep_score_vb_tv>=1,] #uncomment this line if want to use all 6 tools
viruses_high_mod <- viruses_high %>% select(keep_score_vb,
keep_score_vb_tv)
#viruses_high_mod <- apply(viruses_high_mod, c(1,2), function(x) {if (x >= 1) {x <- 1} else {x <- 0}})
viruses_high_mod <- as_tibble(viruses_high_mod)
sm_m <- reshape2::melt(viruses_high_mod)
No id variables; using all as measure variables
colnames(sm_m) <- c("method", "score")
ggplot(sm_m, aes(x=method, y=score,
fill=as.factor(score))) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom"
) +
scale_fill_manual(name = 'Number of Methods',
values = alpha(c(viridis(14)), 1)) +
xlab("Primary Method") +
ylab("Count of Viral Contigs") +
coord_flip()
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
library(pROC)
viruses$truepositive <- rep(0, nrow(viruses))
viruses$truepositive[viruses$seqtype=="virus"] <- 1
rocobj <- roc(viruses$truepositive, viruses$keep_score)
Unknown or uninitialised column: `keep_score`.Error in roc.default(viruses$truepositive, viruses$keep_score) :
No valid data provided.
Sensitivity: The probability that the model predicts a positive outcome for an observation when indeed the outcome is positive. Specificity: The probability that the model predicts a negative outcome for an observation when indeed the outcome is negative.
to do: try coloring above based on the F1 scores of the testing set on each combination